package com.linghang.wusthelper.spider.jwc;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Crawler for the content of an Academic Affairs Office (JWC) announcement page.
 *
 * <p>The extracted HTML fragment is appended to a caller-supplied {@link StringBuilder};
 * the buffer's monitor is then notified. Presumably the caller waits on that same
 * buffer object until the content has arrived (the waiting side is not part of this class).
 *
 * @author origin
 */
public class JwcAnnContentSpider implements PageProcessor {

    /** XPath selecting the announcement body inside the element with id {@code container}. */
    private static final String CONTENT_XPATH = "//*[@id=\"container\"]/div";

    /** Shared output buffer; also used as the lock/monitor for hand-off to the caller. */
    private final StringBuilder content;

    /** Request settings: 2 retries, 3000 ms timeout, and browser-like headers. */
    private final Site site = Site.me().setRetryTimes(2).setTimeOut(3000)
            .addHeader("Referer","http://jwc.wust.edu.cn/1925/list1.htm")
            .addHeader("Upgrade-Insecure-Requests","1")
            .addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");

    /**
     * Creates a spider that writes the crawled announcement HTML into {@code content}.
     *
     * @param content buffer that receives the extracted HTML; must not be {@code null}
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public JwcAnnContentSpider(StringBuilder content){
        // Fail fast here instead of failing later inside process() on the crawler thread.
        this.content = java.util.Objects.requireNonNull(content, "content");
    }

    /**
     * Extracts the announcement fragment from the page and appends it to the shared buffer,
     * then notifies a thread waiting on the buffer.
     *
     * <p>If the selector yields {@code null} (no matching element), nothing is appended —
     * appending a {@code null} String would otherwise write the literal text "null" —
     * but the notification is still sent so that a waiter is not left blocked.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        String fragment = page.getHtml().xpath(CONTENT_XPATH).get();
        // NOTE(review): the original author observed that link/file hrefs in the crawled
        // fragment already carry the http://jwc.wust.edu.cn domain, so no prefixing is done
        // here. Presumably the crawler resolves relative links itself — confirm before relying on it.
        synchronized (content){
            if (fragment != null) {
                content.append(fragment);
            }
            content.notify();
        }
    }

    /**
     * Returns the request configuration used for this crawl.
     *
     * @return the {@link Site} settings (retries, timeout, headers)
     */
    @Override
    public Site getSite() {
        return site;
    }
}
